### Replication code for Taylor C. Boas, F. Daniel Hidalgo, and Guillermo Toral. "Competence versus Priorities: Negative Electoral Responses to Education Quality in Brazil"
### This file replicates the analyses, figures, and tables of the regression discontinuity design included in the main text and in the Appendices, It uses the dataset rdd_dataset.csv, which can be built with the code included in rdd_dataset.R
### R version, platform, and package versions reported at the end of the file
### June 24, 2020

# The following packages need to be installed: XML, Hmisc, readxl, haven, car, stargazer

library(haven); library(car); library(Hmisc); library(readxl); library(XML); library(stargazer)

# Set Working Directory to directory containing this file.

# Clean environment

rm(list=ls(all=T))

# ================================================================================
# Appendix Table D.32 – Most Serious Problem Facing the Country: AmericasBarometer
# ================================================================================

# Download the AmericasBarometer survey data files for Brazil (Stata format), years 2007--2019, from https://www.vanderbilt.edu/lapop/data-access.php and save them in the working directory. If the filenames differ from those listed below, you may have a revised version of these files, and some modification of the code may be necessary to replicate the results.

lapop07<-read_dta('./data/2138048899brazil_lapop_dims final 2007 v5.dta')
lapop08<-read_dta('./data/30541815brazil_lapop_dims_2008_final_data_set_v10.dta')
lapop10<-read_dta('./data/7948266051039660950Brazil_LAPOP_AmericasBarometer 2010 data set  approved v4.dta')
lapop12<-read_dta('./data/54861031Brazil LAPOP AmericasBarometer 2012 Rev1_W.dta')
lapop14<-read_dta('./data/636339374Brazil LAPOP AmericasBarometer 2014 v3.0_W.dta')
lapop17<-read_dta('./data/780314464Brazil LAPOP AmericasBarometer 2017 V1.0_W.dta')
lapop19<-read_dta('./data/Brazil LAPOP AmericasBarometer 2019 v1.0_W.dta')

# Recode biggest problem question for each year, combining similar categories into a common set of top problems

lapop07$problem<-recode(as_factor(lapop07$A4), "'Desemprego/falta de emprego'='Unemployment';c('Violência','Segurança (falta de)','Delinquência, crime, violência')='Security';'Corrupção'='Corruption';'Saúde, falta de serviço'='Health';c('Pobreza','Fome','Desigualdade','Desnutrição')='Poverty/Inequality';'Educação, falta de, má qualidade'='Education';c('Os políticos','Mal governo')='Government';c('Uso de drogas','Narcotráfico')='Drugs';c('Economia, problemas com, crise de','Inflação, altos preços')='Economy';else='Other'",as.factor=T)

lapop08$problem<-recode(as_factor(lapop08$a4), "'Desemprego/falta de emprego'='Unemployment';c('Violência','Segurança (falta de)','Delinquência, crime, violência')='Security';'Corrupção'='Corruption';'Saúde, falta de serviço'='Health';c('Pobreza','Fome','Desigualdade','Desnutrição')='Poverty/Inequality';'Educação, falta de, má qualidade'='Education';c('Os políticos','Mal governo')='Government';c('Uso de drogas','Narcotráfico')='Drugs';c('Economia, problemas com, crise de','Inflação, altos preços')='Economy';else='Other'",as.factor=T)

lapop10$problem<-recode(as_factor(lapop10$a4), "'Desemprego/falta de emprego'='Unemployment';c('Violência','Segurança (falta de)','Delinquência, crime, violência')='Security';'Corrupção'='Corruption';'Saúde, falta de serviço'='Health';c('Pobreza','Fome','Desigualdade','Desnutrição')='Poverty/Inequality';'Educação, falta de, má qualidade'='Education';c('Os políticos','Mau governo')='Government';c('Uso de drogas','Tráfico de drogas')='Drugs';c('Economia, problemas com, crise de','Inflação, altos preços')='Economy';else='Other'",as.factor=T)

lapop12$problem<-recode(as_factor(lapop12$a4), "'Desemprego/falta de emprego'='Unemployment';c('Violência','Segurança (falta de)','Delinquência, crime, violência')='Security';c('Corrupção','Impunidade')='Corruption';'Saúde, falta de serviço'='Health';c('Pobreza','Fome','Desigualdade','Desnutrição')='Poverty/Inequality';'Educação, falta de, má qualidade'='Education';c('Os políticos','Mau governo')='Government';c('Uso de drogas','Tráfico de drogas')='Drugs';c('Economia, problemas com, crise de','Inflação, altos preços')='Economy';else='Other'",as.factor=T)

lapop14$problem<-recode(as_factor(lapop14$a4), "'Desempleo/falta de empleo'='Unemployment';c('Violencia','Seguridad (falta de)','Delincuencia, crimen')='Security';c('Corrupción','Impunidad')='Corruption';'Salud, falta de servicio'='Health';c('Pobreza','Desigualdad','Desnutrición')='Poverty/Inequality';'Educación, falta de, mala calidad'='Education';c('Los políticos','Mal gobierno')='Government';c('Drogas, consumo de drogadicción','Narcotráfico')='Drugs';c('Economía, problemas con, crisis de','Inflación, altos precios')='Economy';else='Other'",as.factor=T)

lapop17$problem<-recode(as_factor(lapop17$a4), "'Unemployment'='Unemployment';c('Violence','Security (lack of)','Crime')='Security';c('Corruption','Impunity')='Corruption';'Health services, lack of'='Health';c('Poverty','Inequality','Desnutrición')='Poverty/Inequality';'Education, lack of, poor quality'='Education';c('Politicians','Bad government')='Government';c('Drug addiction, consumption of drugs','Drug trafficking')='Drugs';c('Economy, problems with, crisis of','Inflation, high prices')='Economy';else='Other'",as.factor=T)

lapop19$problem<-recode(as_factor(lapop19$a4), "'Desempleo/falta de empleo'='Unemployment';c('Violencia','Seguridad (falta de)','Delincuencia, crimen')='Security';c('Corrupción','Impunidad')='Corruption';'Salud, falta de servicio'='Health';c('Pobreza','Desigualdad','Desnutrición')='Poverty/Inequality';'Educación, falta de, mala calidad'='Education';c('Los políticos','Mal gobierno')='Government';c('Drogas, consumo de, drogadicción','Narcotráfico')='Drugs';c('Economía, problemas con, crisis de','Inflación, altos precios')='Economy';else='Other'",as.factor=T)

# Combine into a table

problem.table<-t(rbind(prop.table(table(lapop19$problem)), prop.table(table(lapop17$problem)), prop.table(table(lapop14$problem)), prop.table(table(lapop12$problem)), prop.table(table(lapop10$problem)), prop.table(table(lapop08$problem)), prop.table(table(lapop07$problem))))
problem.table<-data.frame(apply(problem.table,1,mean), problem.table)
names(problem.table)<-c('Average',2019,2017,2014,2012,2010,2008,2007)

# Remove "Government" and "Other" and sort by average

problem.table<-problem.table[-1*which(rownames(problem.table) %in% c('Government','Other')),]
problem.table<-problem.table[order(problem.table$Average,decreasing=T),]
problem.table<-round(100*problem.table,1)

# Write table

problem_table_latex<-latex(problem.table,file='./tables/problem_table.tex', caption='Most Serious Problem Facing the Country: AmericasBarometer', rowlabel='',collabel.just=rep('c',ncol(problem.table)),col.just = rep('c', ncol(problem.table)), booktabs = F, ctable = T, where = "htp")

# ====================================================================
# Appendix Figure D.15 – Biggest Problem and Biggest Campaign Issue in
# the Municipality: Pernambuco Survey
# ====================================================================

baseline<-read_spss('../rct/data/Banco de Dados - Projeto Boston University (3193 CASOS).sav')
endline<-read_spss('../rct/data/Banco Boston University (2577 casos).sav')

baseline$biggest_problem<-ifelse(as_factor(baseline$p13)=='Não sabe / Não respondeu', NA, as.character(as_factor(baseline$p13)))
endline$biggest_problem<-ifelse(as_factor(endline$p10)=='NS/NR', NA, as.character(as_factor(endline$p10)))

campaign_issues<-100* sort(prop.table(table(endline$biggest_problem)),decreasing=T)[c(1:8,10)]
biggest_problem<-100* sort(prop.table(table(baseline$biggest_problem)),decreasing=T)[c(1:5,7:9,13)]
names(biggest_problem)[9]<-'Corrupção/contas do município'
campaign_issues<-campaign_issues[names(biggest_problem)]
issue_data<-rbind(biggest_problem, campaign_issues)
colnames(issue_data)<-c('Health','Crime','Jobs','Drought','Sanitation','Schools','Street\nPaving',"Poor\nGovernment",'Corruption/\nAccounts')
rownames(issue_data)<-c('Biggest Problem','Biggest Campaign Issue')

pdf(file='./figures/biggest_problem.pdf',height=4,width=6)
par(mar=c(6,4,1,1))
barplot(issue_data,ylim=c(0,50),ylab='Percent',beside=T,las=2,legend.text=T,args.legend=list(x='topright'))
dev.off()

# ======================================================================
# Appendix Table D.33 – Predictors of Education as a Problem or Priority
# ======================================================================

# Load data to be merged into survey data files

load('../rct/data/data_rosettastone.RData')

ideb<-data.frame(read_excel('../rdd/data/divulgacao_anos_iniciais_municipios2017-atualizado-Jun_2019.xlsx',skip=9))
ideb<-ideb[which(ideb$REDE=='Municipal'),grep('^COD_MUN|^IDEB14|PROJ14',names(ideb))]
ideb<-data.frame(apply(ideb,2,as.numeric))
ideb_gap<-with(ideb,data.frame(codeibge=rep(COD_MUN,6),ideb_yr=c(rep(2007,nrow(ideb)),rep(2009,nrow(ideb)),rep(2011,nrow(ideb)),rep(2013,nrow(ideb)),rep(2015,nrow(ideb)),rep(2017,nrow(ideb))), ideb_gap=c(IDEB14_07-PROJ14_07, IDEB14_09-PROJ14_09, IDEB14_11-PROJ14_11, IDEB14_13-PROJ14_13, IDEB14_15-PROJ14_15, IDEB14_17-PROJ14_17))) 

# Prepare LAPOP combined dataset and run logit regression

lapop08$yr<-2008
lapop10$yr<-2010
lapop12$yr<-2012
lapop14$yr<-2014
lapop17$yr<-2017
lapop19$yr<-2019

lapop08$ideb_yr<-2007
lapop10$ideb_yr<-2009
lapop12$ideb_yr<-2011
lapop14$ideb_yr<-2013
lapop17$ideb_yr<-2017
lapop19$ideb_yr<-2017

lapop08$mun_tse2006<-gsub('^ | $','',as_factor(lapop08$municipio))
lapop08$state<-as.character(as_factor(lapop08$prov))
lapop08$mun_tse2006[lapop08$mun_tse2006=='FLORIANÓPOLIS']<-'FLORIANOPOLIS'
lapop08$mun_tse2006[lapop08$mun_tse2006=='RIO PRETO EVA']<-'RIO PRETO DA EVA'
lapop08$state[lapop08$mun_tse2006=='FATIMA DO SUL']<-'MS' # Miscoding in data file
unique(paste(lapop08$state,lapop08$mun_tse2006))[(! unique(paste(lapop08$state,lapop08$mun_tse2006)) %in% paste(br$state, br$mun_tse2006))]
lapop08<-merge(lapop08,br[,c('codeibge','state','mun_tse2006')])
lapop10$codeibge<-substr(lapop10$municipio,3,9)
# 2012: Municipio codes are IBGE codes, with '15' appended to the front of each, except for 3550308 (São Paulo), 355038 (apparently São Paulo with data entry error), and 3515004 (Embu das Artes, SP). 15355030 is apparently São Paulo as well. Verified in these cases that state is SP and tamano is Capital.
lapop12$codeibge<-gsub('^15','', lapop12$municipio)
lapop12$codeibge[lapop12$codeibge %in% c('355038', '355030')]<-'3550308'
lapop14$codeibge<-paste0(substr(lapop14$prov,3,4),substr(lapop14$municipio,3,9))
lapop17$codeibge<-lapop17$municipio
lapop19$codeibge<-substr(lapop19$municipio,3,9)

lapop08$white<-as_factor(lapop08$etid)=='Branca'
lapop08$black<-as_factor(lapop08$etid)=='Preta'
lapop10$white<-as_factor(lapop10$etid)=='Branca'
lapop10$black<-as_factor(lapop10$etid)=='Preta'
lapop12$white<-as_factor(lapop12$etid)=='Branca'
lapop12$black<-as_factor(lapop12$etid)=='Preta'
lapop14$white<-as_factor(lapop14$etid)=='Blanco'
lapop14$black<-as_factor(lapop14$etid)=='Negro'
lapop17$white<-as_factor(lapop17$etid)=='White'
lapop17$black<-as_factor(lapop17$etid)=='Black'
lapop19$white<-as_factor(lapop19$etid)=='Blanca'
lapop19$black<-as_factor(lapop19$etid)=='Negra'

lapop08$female<-lapop08$q1==2
lapop10$female<-lapop10$q1==2
lapop12$female<-lapop12$q1==2
lapop14$female<-lapop14$q1==2
lapop17$female<-lapop17$q1==2
lapop19$female<-lapop19$q1==2

lapop08$reg<-factor(lapop08$estratopri,levels=1501:1505,labels=c('North','Northeast','Center-West','Southeast','South'))
lapop10$reg<-factor(lapop10$estratopri,levels=1501:1505,labels=c('North','Northeast','Center-West','Southeast','South'))
lapop12$reg<-factor(lapop12$estratopri,levels=1501:1505,labels=c('North','Northeast','Center-West','Southeast','South'))
lapop14$reg<-factor(lapop14$estratopri,levels=1501:1505,labels=c('North','Northeast','Center-West','Southeast','South'))
lapop17$reg<-factor(lapop17$estratopri,levels=1501:1505,labels=c('North','Northeast','Center-West','Southeast','South'))
lapop19$reg<-factor(lapop19$estratopri,levels=1501:1505,labels=c('North','Northeast','Center-West','Southeast','South'))

lapop08$ed<-as.numeric(lapop08$ed)
lapop10$ed<-as.numeric(lapop10$ed)
lapop12$ed<-as.numeric(lapop12$ed)
lapop14$ed<-as.numeric(lapop14$ed)
lapop17$ed<-as.numeric(lapop17$ed)
lapop19$ed<-as.numeric(lapop19$ed)

lapop08$age<-as.numeric(lapop08$q2)
lapop10$age<-as.numeric(lapop10$q2)
lapop12$age<-as.numeric(lapop12$q2)
lapop14$age<-as.numeric(lapop14$q2)
lapop17$age<-as.numeric(lapop17$q2)
lapop19$age<-as.numeric(lapop19$q2)

lapop<-rbind(lapop08[,c('codeibge','ideb_yr','yr','reg','ed','female','age','white','black','problem')], lapop10[,c('codeibge','ideb_yr','yr','reg','ed','female','age','white','black','problem')], lapop12[,c('codeibge','ideb_yr','yr','reg','ed','female','age','white','black','problem')], lapop14[,c('codeibge','ideb_yr','yr','reg','ed','female','age','white','black','problem')], lapop17[,c('codeibge','ideb_yr','yr','reg','ed','female','age','white','black','problem')], lapop19[,c('codeibge','ideb_yr','yr','reg','ed','female','age','white','black','problem')])

lapop$yr<-factor(lapop$yr)
lapop$eduprob<-lapop$problem=='Education'

lapop<-merge(lapop, ideb_gap)

mod1<-glm(eduprob ~ ed + age + female + white + black + reg + yr + ideb_gap, family=binomial(link='logit'), data= lapop)

# Prepare Pernambuco panel survey file and run logit regression

baseline<-read_spss('../rct/data/Banco de Dados - Projeto Boston University (3193 CASOS).sav')
r1_data<-read.csv('../rct/data/data_questionnaire_panel.csv',as.is=T)
baseline<-merge(baseline,r1_data[,c('qnum','codeibge','area1'),],by.x='quest',by.y='qnum')
baseline$eduprob<-as_factor(baseline$p13)=='Educação/escolas/creches'
baseline$age<-baseline$p2
baseline$female<-baseline$p1==2
baseline$white<-baseline$p45==1
baseline$black<-baseline$p45==2
baseline$ed<-ifelse(baseline$p44==99,NA, baseline$p44)
baseline<-merge(baseline,ideb_gap[ideb_gap$ideb_yr==2015,])

mod2<-glm(eduprob ~ ed + age + female + white + black + ideb_gap, family=binomial(link='logit'), data= baseline)

# Prepare online survey file and run logit regression

reg<-readHTMLTable('./data/ESTADOS E CAPITAIS DO BRASIL.html',colClasses='character',stringsAsFactors=F)[[1]][,c('Sigla','Região')]
names(reg)<-c('uf','region')

load('./data/municipios.RData')

online<-read.csv('../online_survey/data/online_survey_anonymized.csv',as.is=T)
online<-online[-1*c(1:2),]
online_text<-read.csv('../online_survey/data/online_survey_text_anonymized.csv',as.is=T)
online_text<-online_text[-1*c(1:2),]

# Eliminate non-eligible respondents, those who did not finish, and those from a municipio with no IDEB data.

online_text<-online_text[which(online$ip_country=='Brazil'&online$p1==1&online$p2==1&online$p3==1&as.numeric(online$p4)>1&online$p15!=''&online$ideb!=''),]
online<-online[which(online$ip_country=='Brazil'&online$p1==1&online$p2==1&online$p3==1&as.numeric(online$p4)>1&online$p15!=''&online$ideb!=''),]

online$uf<-online_text$p5
online$muni<-apply(online_text[,grep('p6',names(online_text))],1,function(x) x[!is.na(x)])
munis$codeibge<-munis$'Código Município Completo'
munis$uf<-munis$Sigla_UF
munis$muni<-munis$Nome_Município
online<-merge(online,munis[,c('uf','muni','codeibge')])
online<-merge(online,reg)
online<-merge(online,ideb_gap[ideb_gap$ideb_yr==2017,])

online$age<-as.numeric(online$p4)+16
online$female<-online$p14==2
online$white<-online$p13==1
online$black<-online$p13==2
online$ed<-as.numeric(online$p15)-10
online$edu_priority<-as.numeric(online$p7_1) <= median(as.numeric(online$p7_1))
online$reg<-factor(online$reg, levels=c('Norte','Nordeste','Centro-Oeste','Sudeste','Sul'), labels=c('North','Northeast','Center-West','Southeast','South'))

mod3<-glm(edu_priority ~ ed + age + female + white + black + reg + ideb_gap, family=binomial(link='logit'), data=online)

# Write combo table

stargazer(mod1, mod2, mod3, type='latex',star.cutoffs=c(0.1,0.05,0.01), 
covariate.labels=c('Education','Age','Female','White','Black','IDEB Gap'),
column.labels=c('Problem (LAPOP)','Problem (Panel)','Priority (Online)'),
dep.var.labels.include=F,
dep.var.caption='DV: Education as a...',
omit= setdiff(names(coef(mod1)),names(coef(mod2))),
label='tab:edu_prob',
title='Predictors of Education as a Problem or Priority',
notes='',notes.label='',notes.append=F,
out='./tables/edu_problem_models.tex')

# =======================================
# Appendix Table D.31 – Sample Statistics
# =======================================

# RDD sample and population

muni_elect<-unique(read.csv('../rdd/data/muns_bandwidth_reelection.csv',colClasses='character')[,'cod_ibge'])

age<-data.frame(read_excel('./data/age_muni_18plus_2010.xlsx',skip=7,na='-'))
names(age)<-c('codeibge','muni','total',paste0('age',18:100))
age$uf<-gsub('.*\\(([A-Z]{2})\\).*','\\1',age$muni)
age$muni<-gsub(' \\([A-Z]{2}\\)','',age$muni)
age<-age[1:5565,c('codeibge','uf','muni','total',paste0('age',18:100))]
age[is.na(age)]<-0

cumpct_age<-data.frame(codeibge=age$codeibge,uf=age$uf,muni=age$muni,100 * t(apply(age[,grep('age',names(age))],1,cumsum)) / apply(age[,grep('age',names(age))],1,sum))
med_age_muni<-data.frame(codeibge= cumpct_age$codeibge,uf= cumpct_age$uf,muni= cumpct_age$muni,med_age=apply(cumpct_age[,grep('age',names(cumpct_age))], 1, function(x) as.numeric(substr(names(which.min(x[x>50])),4,5))))
med_age_elect<-mean(med_age_muni$med_age[substr(med_age_muni$codeibge,1,6) %in% muni_elect])
med_age_munis<-mean(med_age_muni$med_age)
age_pop<-apply(age[,grep('age',names(age))],2,sum,na.rm=T)
cumpct_age_pop<-100*cumsum(age_pop)/sum(age_pop)
med_age_pop<-as.numeric(substr(names(which.min(cumpct_age_pop[cumpct_age_pop > 50])),4,5))

muni_pop<-data.frame(codeibge=age$codeibge,uf=age$uf,total=age$total,adult=apply(age[,grep('age',names(age))],1,sum,na.rm=T))
muni_pop<-merge(muni_pop,reg)
reg_pop<-with(muni_pop, aggregate(list(adult=adult),list(region=region),sum))
reg_pop_pct<-100*prop.table(reg_pop$adult)
names(reg_pop_pct)<-reg_pop$region
reg_munis_pct<-100*prop.table(table(muni_pop$region))
reg_elect_pct<-100*prop.table(table(muni_pop$region[substr(muni_pop$codeibge,1,6) %in% muni_elect]))

# Median municipal population: sort municipalities in increasing order of total population
muni_pop<-muni_pop[order(muni_pop$total),]
# Calculate each municipality's percent of Brazil's total adult population
muni_pop$pct<-100*muni_pop$adult/sum(muni_pop$adult)
# Calculate percent of all Brazilian adults living in a municipality with that population or less 
muni_pop$cum.pct<-cumsum(muni_pop$pct)
# Total municipal population associated with the median adult 
med_muni_pop<-muni_pop$total[which(muni_pop$cum.pct==min(muni_pop$cum.pct[muni_pop$cum.pct >= 50]))]

med_muni_elect<-median(muni_pop$total[substr(muni_pop$codeibge,1,6) %in% muni_elect])
med_muni_munis<-median(muni_pop$total)

sex<-data.frame(read_excel('./data/sex_muni_18plus_2010.xlsx',skip=7,na='-'))
names(sex)[1:2]<-c('codeibge','muni')
sex$uf<-gsub('.*\\(([A-Z]{2})\\).*','\\1',sex$muni)
sex$muni<-gsub(' \\([A-Z]{2}\\)','',sex$muni)
sex<-sex[1:5565,c('codeibge','uf','muni',setdiff(names(sex),c('codeibge','uf','muni')))]
male_pct<-data.frame(codeibge= sex$codeibge,uf= sex$uf,muni= sex$muni,male_pct=100*sex$Homem/sex$Total)
male_pop_pct<-100*sum(sex$Homem)/sum(sex$Total)
male_elect_pct<-mean(male_pct$male_pct[substr(male_pct$codeibge,1,6) %in% muni_elect])
male_munis_pct<-mean(male_pct$male_pct)

edu<-data.frame(read_excel('./data/educ_muni_18plus_2010.xlsx',skip=8,na='-'))
names(edu)[1:2]<-c('codeibge','muni')
edu$uf<-gsub('.*\\(([A-Z]{2})\\).*','\\1',edu$muni)
edu$muni<-gsub(' \\([A-Z]{2}\\)','',edu$muni)
edu<-edu[1:5565,c('codeibge','uf','muni',setdiff(names(edu),c('codeibge','uf','muni')))]
edu_pct<-data.frame(codeibge=edu$codeibge,uf=edu$uf,muni=edu$muni,100*prop.table(as.matrix(edu[,4:7]),1))
edu_elect_pct<-apply(edu_pct[substr(edu_pct$codeibge,1,6) %in% muni_elect,-1:-3], 2, mean)
edu_munis_pct<-apply(edu_pct[,-1:-3], 2, mean)
edu_pop_pct<-100*prop.table(apply(edu[,4:7],2,sum))
names(edu_elect_pct)<-names(edu_pop_pct)<-c('Less than Primary','Primary','Secondary','Higher')

race<-data.frame(read_excel('./data/race_muni_18plus_2010.xlsx',skip=7,na='-'))
names(race)[1:2]<-c('codeibge','muni')
race$Other<-apply(race[,c('Amarela','Indígena','Sem.declaração')],1,sum,na.rm=T)
race$uf<-gsub('.*\\(([A-Z]{2})\\).*','\\1',race$muni)
race$muni<-gsub(' \\([A-Z]{2}\\)','',race$muni)
race<-race[1:5565,c('codeibge','uf','muni',setdiff(names(race),c('codeibge','uf','muni')))]
race_pct<-data.frame(codeibge=race$codeibge,uf=race$uf,muni=race$muni,100*prop.table(as.matrix(race[,c(4,5,7,10)]),1))
race_elect_pct<-apply(race_pct[substr(race_pct$codeibge,1,6) %in% muni_elect,-1:-3], 2, mean, na.rm=T)
race_munis_pct<-apply(race_pct[,-1:-3], 2, mean, na.rm=T)
race_pop_pct<-100*prop.table(apply(race[,c(4,5,7,10)],2,sum,na.rm=T))

# Pernambuco survey

panel<-read.csv('../rct/data/panel_cleaned.csv')
panel$edu<-factor(1*(panel$years_edu >= 0 & panel$years_edu <= 8)+2*(panel$years_edu >= 9 & panel$years_edu <= 11)+3*(panel$years_edu >= 12 & panel$years_edu <= 15)+4*(panel$years_edu >=16),labels=c('Less than Primary','Primary','Secondary','Higher'))
panel$edu[is.na(panel$years_edu)]<-NA
panel<-merge(panel,muni_pop[,c('codeibge','total')])

med_age_panel<-median(panel$age,na.rm=T)
reg_panel_pct<-100*prop.table(table(factor('Nordeste',levels=c('Centro-Oeste','Nordeste','Norte','Sudeste','Sul'))))
male_panel_pct<-100* mean(panel$female==0)
edu_panel_pct<-100* prop.table(table(panel$edu))
race_panel_pct<-100* prop.table(table(panel$race))
race_panel_pct<-race_panel_pct[c('Branca','Preta','Parda','Other')]
med_muni_panel<-median(panel$total)

# Online survey

online<-merge(online,muni_pop[,c('codeibge','total')],all.x=T)
online$male<-!online$female
online$race<-factor(1*(online$p13==1)+2*(online$p13==2)+3*(online$p13==3)+4*(online$p13 %in% 4:6), labels=c('Branca','Preta','Parda','Other'))
online$edu<-factor(1*(online$p15 %in% 11:12)+2*(online$p15 %in% 13:14)+3*(online$p15 %in% c(15,16,18))+4*(online$p15 %in% c(17,19,20)), labels = c('Less than Primary','Primary','Secondary','Higher'))

med_age_online<-median(online$age,na.rm=T)
reg_online_pct<-100* prop.table(table(online$reg))
male_online_pct<-100* mean(online$male)
edu_online_pct<-100* prop.table(table(online$edu))
race_online_pct<-100* prop.table(table(online$race))
med_muni_online<-median(online$total,na.rm=T)

# Dataset and table

sample_stats<-data.frame(rdd_elect=c(med_muni_elect, reg_elect_pct, race_elect_pct, edu_elect_pct, med_age_elect, male_elect_pct), munis=c(med_muni_munis, reg_munis_pct, race_munis_pct, edu_munis_pct, med_age_munis, male_munis_pct), panel=c(med_muni_panel, reg_panel_pct, race_panel_pct, edu_panel_pct, med_age_panel, male_panel_pct), online=c(med_muni_online, reg_online_pct, race_online_pct, edu_online_pct, med_age_online, male_online_pct), brazil=c(med_muni_pop, reg_pop_pct, race_pop_pct, edu_pop_pct, med_age_pop, male_pop_pct))
rownames(sample_stats)<-c('Median Population','Center-West','Northeast','North','Southeast','South','White','Black','Brown','Other','Less than Primary','Primary','Secondary','Higher','Median Age','Male')

sample_stats_table<-apply(round(sample_stats,1),2,prettyNum,big.mark=',')

sample_stats_table_latex<-latex(sample_stats_table,file='./tables/sample_stats_table.tex', caption='Sample Statistics', rowlabel='',colheads=c('RDD:\nReelection','Census\n(Municipalities)','Panel\nSurvey','Online\nSurvey','Census\n(Individuals)'),extracolsize='normalsize',collabel.just=rep('r',5),col.just = rep('r',5),rgroup = c('Municipality','Region','Race','Education','Other'), n.rgroup = c(1,5,4,4,2), booktabs = T, ctable = T, where = "htp")

# NOTES -- R version, platform, and loaded packages -------------------------
# sessionInfo(package = NULL)
# R version 4.0.1 (2020-06-06)
# Platform: x86_64-apple-darwin17.0 (64-bit)
# Running under: macOS High Sierra 10.13.6

# Matrix products: default
# BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
# LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib

# locale:
# [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

# attached base packages:
# [1] stats     graphics  grDevices utils     datasets  methods   base     

# other attached packages:
 # [1] stargazer_5.2.2 XML_3.99-0.3    readxl_1.3.1    Hmisc_4.4-0     ggplot2_3.3.1  
 # [6] Formula_1.2-3   survival_3.1-12 lattice_0.20-41 car_3.0-8       carData_3.0-4  
# [11] haven_2.3.1    

# loaded via a namespace (and not attached):
 # [1] Rcpp_1.0.4.6        png_0.1-7           digest_0.6.25       R6_2.4.1           
 # [5] cellranger_1.1.0    backports_1.1.7     acepack_1.4.1       pillar_1.4.4       
 # [9] rlang_0.4.6         curl_4.3            rstudioapi_0.11     data.table_1.12.8  
# [13] rpart_4.1-15        Matrix_1.2-18       checkmate_2.0.0     splines_4.0.1      
# [17] readr_1.3.1         stringr_1.4.0       foreign_0.8-80      htmlwidgets_1.5.1  
# [21] munsell_0.5.0       compiler_4.0.1      xfun_0.14           pkgconfig_2.0.3    
# [25] base64enc_0.1-3     htmltools_0.4.0     nnet_7.3-14         tidyselect_1.1.0   
# [29] tibble_3.0.1        gridExtra_2.3       htmlTable_1.13.3    rio_0.5.16         
# [33] crayon_1.3.4        dplyr_1.0.0         withr_2.2.0         grid_4.0.1         
# [37] gtable_0.3.0        lifecycle_0.2.0     magrittr_1.5        scales_1.1.1       
# [41] zip_2.0.4           stringi_1.4.6       latticeExtra_0.6-29 ellipsis_0.3.1     
# [45] generics_0.0.2      vctrs_0.3.1         openxlsx_4.1.5      RColorBrewer_1.1-2 
# [49] tools_4.0.1         forcats_0.5.0       glue_1.4.1          purrr_0.3.4        
# [53] hms_0.5.3           jpeg_0.1-8.1        abind_1.4-5         colorspace_1.4-1   
# [57] cluster_2.1.0       knitr_1.28         
